import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px ### for plotting the data on world map
print('Modules are imported.')
Modules are imported.
Importing "covid19_Confirmed_dataset.csv" from the "./Datasets" folder.
# Load the confirmed-cases table: one row per province/country and one
# column per reporting date, plus location columns.
corona_dataset_csv = pd.read_csv("Datasets/covid19_Confirmed_dataset.csv")
corona_dataset_csv.head(10)
| Province/State | Country/Region | Lat | Long | 1/22/20 | 1/23/20 | 1/24/20 | 1/25/20 | 1/26/20 | 1/27/20 | ... | 4/21/20 | 4/22/20 | 4/23/20 | 4/24/20 | 4/25/20 | 4/26/20 | 4/27/20 | 4/28/20 | 4/29/20 | 4/30/20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | Afghanistan | 33.0000 | 65.0000 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1092 | 1176 | 1279 | 1351 | 1463 | 1531 | 1703 | 1828 | 1939 | 2171 |
| 1 | NaN | Albania | 41.1533 | 20.1683 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 609 | 634 | 663 | 678 | 712 | 726 | 736 | 750 | 766 | 773 |
| 2 | NaN | Algeria | 28.0339 | 1.6596 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2811 | 2910 | 3007 | 3127 | 3256 | 3382 | 3517 | 3649 | 3848 | 4006 |
| 3 | NaN | Andorra | 42.5063 | 1.5218 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 717 | 723 | 723 | 731 | 738 | 738 | 743 | 743 | 743 | 745 |
| 4 | NaN | Angola | -11.2027 | 17.8739 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 24 | 25 | 25 | 25 | 25 | 26 | 27 | 27 | 27 | 27 |
| 5 | NaN | Antigua and Barbuda | 17.0608 | -61.7964 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 23 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 |
| 6 | NaN | Argentina | -38.4161 | -63.6167 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3031 | 3144 | 3435 | 3607 | 3780 | 3892 | 4003 | 4127 | 4285 | 4428 |
| 7 | NaN | Armenia | 40.0691 | 45.0382 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1401 | 1473 | 1523 | 1596 | 1677 | 1746 | 1808 | 1867 | 1932 | 2066 |
| 8 | Australian Capital Territory | Australia | -35.4735 | 149.0124 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 104 | 104 | 104 | 105 | 106 | 106 | 106 | 106 | 106 | 106 |
| 9 | New South Wales | Australia | -33.8688 | 151.2093 | 0 | 0 | 0 | 0 | 3 | 4 | ... | 2969 | 2971 | 2976 | 2982 | 2994 | 3002 | 3004 | 3016 | 3016 | 3025 |
10 rows × 104 columns
corona_dataset_csv.shape
(266, 104)
# The coordinates are not used in the per-country analysis; remove them in place.
corona_dataset_csv.drop(columns=["Lat", "Long"], inplace=True)
corona_dataset_csv.head(10)
| Province/State | Country/Region | 1/22/20 | 1/23/20 | 1/24/20 | 1/25/20 | 1/26/20 | 1/27/20 | 1/28/20 | 1/29/20 | ... | 4/21/20 | 4/22/20 | 4/23/20 | 4/24/20 | 4/25/20 | 4/26/20 | 4/27/20 | 4/28/20 | 4/29/20 | 4/30/20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | Afghanistan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1092 | 1176 | 1279 | 1351 | 1463 | 1531 | 1703 | 1828 | 1939 | 2171 |
| 1 | NaN | Albania | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 609 | 634 | 663 | 678 | 712 | 726 | 736 | 750 | 766 | 773 |
| 2 | NaN | Algeria | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2811 | 2910 | 3007 | 3127 | 3256 | 3382 | 3517 | 3649 | 3848 | 4006 |
| 3 | NaN | Andorra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 717 | 723 | 723 | 731 | 738 | 738 | 743 | 743 | 743 | 745 |
| 4 | NaN | Angola | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 24 | 25 | 25 | 25 | 25 | 26 | 27 | 27 | 27 | 27 |
| 5 | NaN | Antigua and Barbuda | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 23 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 | 24 |
| 6 | NaN | Argentina | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 3031 | 3144 | 3435 | 3607 | 3780 | 3892 | 4003 | 4127 | 4285 | 4428 |
| 7 | NaN | Armenia | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1401 | 1473 | 1523 | 1596 | 1677 | 1746 | 1808 | 1867 | 1932 | 2066 |
| 8 | Australian Capital Territory | Australia | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 104 | 104 | 104 | 105 | 106 | 106 | 106 | 106 | 106 | 106 |
| 9 | New South Wales | Australia | 0 | 0 | 0 | 0 | 3 | 4 | 4 | 4 | ... | 2969 | 2971 | 2976 | 2982 | 2994 | 3002 | 3004 | 3016 | 3016 | 3025 |
10 rows × 102 columns
# Collapse province-level rows into one row per country.
# numeric_only=True keeps the text column "Province/State" out of the sum:
# recent pandas versions no longer drop non-numeric columns silently, and a
# string value in a country row would break the .diff()/.plot() calls below.
corona_dataset_aggregated = corona_dataset_csv.groupby("Country/Region").sum(numeric_only=True)
corona_dataset_aggregated.head()
| 1/22/20 | 1/23/20 | 1/24/20 | 1/25/20 | 1/26/20 | 1/27/20 | 1/28/20 | 1/29/20 | 1/30/20 | 1/31/20 | ... | 4/21/20 | 4/22/20 | 4/23/20 | 4/24/20 | 4/25/20 | 4/26/20 | 4/27/20 | 4/28/20 | 4/29/20 | 4/30/20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Country/Region | |||||||||||||||||||||
| Afghanistan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1092 | 1176 | 1279 | 1351 | 1463 | 1531 | 1703 | 1828 | 1939 | 2171 |
| Albania | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 609 | 634 | 663 | 678 | 712 | 726 | 736 | 750 | 766 | 773 |
| Algeria | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2811 | 2910 | 3007 | 3127 | 3256 | 3382 | 3517 | 3649 | 3848 | 4006 |
| Andorra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 717 | 723 | 723 | 731 | 738 | 738 | 743 | 743 | 743 | 745 |
| Angola | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 24 | 25 | 25 | 25 | 25 | 26 | 27 | 27 | 27 | 27 |
5 rows × 100 columns
corona_dataset_aggregated.shape
(187, 100)
Visualization always helps us understand our data better.
# Overlay the confirmed-case curves of a few countries on the same axes.
for country in ("China", "Italy", "Spain", "Canada"):
    corona_dataset_aggregated.loc[country].plot()
plt.legend()
<matplotlib.legend.Legend at 0x1a20163d10>
We need to find a good measure, represented as a single number, that describes the spread of the virus in a country.
corona_dataset_aggregated.loc['Canada'].plot()
<AxesSubplot:>
corona_dataset_aggregated.loc["Canada"][:3].plot()
<AxesSubplot:>
corona_dataset_aggregated.loc["Canada"].diff().plot()
<AxesSubplot:>
corona_dataset_aggregated.loc["Canada"].diff().max()
2778.0
corona_dataset_aggregated.loc["Canada"].diff().max()
2778.0
corona_dataset_aggregated.loc["Spain"].diff().max()
9630.0
# Largest day-over-day increase in confirmed cases for every country,
# listed in the same order as the aggregated table's index.
countries = corona_dataset_aggregated.index.tolist()
max_infection_rates = [
    corona_dataset_aggregated.loc[country].diff().max() for country in countries
]
max_infection_rates
[232.0, 34.0, 199.0, 43.0, 5.0, 6.0, 291.0, 134.0, 497.0, 1321.0, 105.0, 7.0, 301.0, 641.0, 12.0, 1485.0, 2454.0, 4.0, 19.0, 1.0, 104.0, 92.0, 7.0, 7502.0, 26.0, 137.0, 41.0, 21.0, 6.0, 45.0, 31.0, 203.0, 2778.0, 31.0, 21.0, 1138.0, 15136.0, 353.0, 1.0, 57.0, 81.0, 37.0, 113.0, 96.0, 63.0, 58.0, 381.0, 391.0, 99.0, 156.0, 5.0, 371.0, 11536.0, 269.0, 32.0, 130.0, 7.0, 134.0, 20.0, 9.0, 5.0, 267.0, 26849.0, 38.0, 5.0, 42.0, 6933.0, 403.0, 156.0, 6.0, 68.0, 167.0, 132.0, 12.0, 10.0, 3.0, 72.0, 210.0, 99.0, 1893.0, 436.0, 3186.0, 91.0, 1515.0, 1131.0, 6557.0, 52.0, 1161.0, 40.0, 264.0, 29.0, 851.0, 289.0, 300.0, 69.0, 3.0, 48.0, 61.0, 17.0, 13.0, 21.0, 90.0, 234.0, 7.0, 14.0, 10.0, 235.0, 190.0, 58.0, 52.0, 2.0, 41.0, 1425.0, 222.0, 12.0, 13.0, 30.0, 281.0, 19.0, 3.0, 14.0, 1346.0, 89.0, 2.0, 69.0, 208.0, 107.0, 386.0, 144.0, 1292.0, 357.0, 5.0, 27.0, 3683.0, 538.0, 545.0, 1516.0, 957.0, 523.0, 7099.0, 22.0, 5.0, 6.0, 4.0, 54.0, 6.0, 1351.0, 87.0, 2379.0, 2.0, 20.0, 1426.0, 114.0, 70.0, 73.0, 354.0, 28.0, 9630.0, 65.0, 67.0, 3.0, 812.0, 1321.0, 6.0, 27.0, 15.0, 181.0, 188.0, 10.0, 14.0, 40.0, 82.0, 5138.0, 36188.0, 11.0, 578.0, 552.0, 8733.0, 48.0, 167.0, 29.0, 19.0, 66.0, 4.0, 5.0, 9.0, 8.0]
# Largest day-over-day increase in confirmed cases per country, stored as a
# new column of the aggregated table.
countries = list(corona_dataset_aggregated.index)
# Leave the derived column out of the calculation: otherwise re-running this
# cell would diff the last date against "max_infection_rates" itself.
date_columns = corona_dataset_aggregated.columns.drop("max_infection_rates", errors="ignore")
# Row-wise first difference across the date columns, then the row maximum;
# this is the vectorized form of calling .diff().max() on each country's row.
max_infection_rates = (
    corona_dataset_aggregated[date_columns].diff(axis=1).max(axis=1).tolist()
)
corona_dataset_aggregated["max_infection_rates"] = max_infection_rates
corona_dataset_aggregated.head()
| 1/22/20 | 1/23/20 | 1/24/20 | 1/25/20 | 1/26/20 | 1/27/20 | 1/28/20 | 1/29/20 | 1/30/20 | 1/31/20 | ... | 4/22/20 | 4/23/20 | 4/24/20 | 4/25/20 | 4/26/20 | 4/27/20 | 4/28/20 | 4/29/20 | 4/30/20 | max_infection_rates | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Country/Region | |||||||||||||||||||||
| Afghanistan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1176 | 1279 | 1351 | 1463 | 1531 | 1703 | 1828 | 1939 | 2171 | 232.0 |
| Albania | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 634 | 663 | 678 | 712 | 726 | 736 | 750 | 766 | 773 | 34.0 |
| Algeria | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2910 | 3007 | 3127 | 3256 | 3382 | 3517 | 3649 | 3848 | 4006 | 199.0 |
| Andorra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 723 | 723 | 731 | 738 | 738 | 743 | 743 | 743 | 745 | 43.0 |
| Angola | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 25 | 25 | 25 | 25 | 26 | 27 | 27 | 27 | 27 | 5.0 |
5 rows × 101 columns
# Keep only the derived measure, as a one-column frame indexed by country.
corona_data = corona_dataset_aggregated["max_infection_rates"].to_frame()
corona_data.head()
| max_infection_rates | |
|---|---|
| Country/Region | |
| Afghanistan | 232.0 |
| Albania | 34.0 |
| Algeria | 199.0 |
| Andorra | 43.0 |
| Angola | 5.0 |
# Load the worldwide happiness report (one row per country or region).
happiness_report_csv = pd.read_csv("Datasets/worldwide_happiness_report.csv")
happiness_report_csv.head()
| Overall rank | Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Finland | 7.769 | 1.340 | 1.587 | 0.986 | 0.596 | 0.153 | 0.393 |
| 1 | 2 | Denmark | 7.600 | 1.383 | 1.573 | 0.996 | 0.592 | 0.252 | 0.410 |
| 2 | 3 | Norway | 7.554 | 1.488 | 1.582 | 1.028 | 0.603 | 0.271 | 0.341 |
| 3 | 4 | Iceland | 7.494 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.118 |
| 4 | 5 | Netherlands | 7.488 | 1.396 | 1.522 | 0.999 | 0.557 | 0.322 | 0.298 |
# Columns that are not used as explanatory factors in this analysis.
useless_cols = ["Overall rank", "Score", "Generosity", "Perceptions of corruption"]
happiness_report_csv.drop(columns=useless_cols, inplace=True)
happiness_report_csv.head()
| Country or region | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | |
|---|---|---|---|---|---|
| 0 | Finland | 1.340 | 1.587 | 0.986 | 0.596 |
| 1 | Denmark | 1.383 | 1.573 | 0.996 | 0.592 |
| 2 | Norway | 1.488 | 1.582 | 1.028 | 0.603 |
| 3 | Iceland | 1.380 | 1.624 | 1.026 | 0.591 |
| 4 | Netherlands | 1.396 | 1.522 | 0.999 | 0.557 |
# put the name of the countries as the index
happiness_report_csv.set_index("Country or region", inplace = True)
happiness_report_csv.head()
| GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | |
|---|---|---|---|---|
| Country or region | ||||
| Finland | 1.340 | 1.587 | 0.986 | 0.596 |
| Denmark | 1.383 | 1.573 | 0.996 | 0.592 |
| Norway | 1.488 | 1.582 | 1.028 | 0.603 |
| Iceland | 1.380 | 1.624 | 1.026 | 0.591 |
| Netherlands | 1.396 | 1.522 | 0.999 | 0.557 |
corona_data.head()
| max_infection_rates | |
|---|---|
| Country/Region | |
| Afghanistan | 232.0 |
| Albania | 34.0 |
| Algeria | 199.0 |
| Andorra | 43.0 |
| Angola | 5.0 |
corona_data.shape
(187, 1)
happiness_report_csv.head()
| GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | |
|---|---|---|---|---|
| Country or region | ||||
| Finland | 1.340 | 1.587 | 0.986 | 0.596 |
| Denmark | 1.383 | 1.573 | 0.996 | 0.592 |
| Norway | 1.488 | 1.582 | 1.028 | 0.603 |
| Iceland | 1.380 | 1.624 | 1.026 | 0.591 |
| Netherlands | 1.396 | 1.522 | 0.999 | 0.557 |
happiness_report_csv.shape
(156, 4)
# Inner join on the country index: keep only the countries that appear in
# both the infection-rate table and the happiness report.
data = corona_data.join(happiness_report_csv, how ="inner")
data.head()
| max_infection_rates | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | |
|---|---|---|---|---|---|
| Afghanistan | 232.0 | 0.350 | 0.517 | 0.361 | 0.000 |
| Albania | 34.0 | 0.947 | 0.848 | 0.874 | 0.383 |
| Algeria | 199.0 | 1.002 | 1.160 | 0.785 | 0.086 |
| Argentina | 291.0 | 1.092 | 1.432 | 0.881 | 0.471 |
| Armenia | 134.0 | 0.850 | 1.055 | 0.815 | 0.283 |
# Pairwise correlation between all columns of the joined table
data.corr()
| max_infection_rates | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | |
|---|---|---|---|---|---|
| max_infection_rates | 1.000000 | 0.250118 | 0.191958 | 0.289263 | 0.078196 |
| GDP per capita | 0.250118 | 1.000000 | 0.759468 | 0.863062 | 0.394603 |
| Social support | 0.191958 | 0.759468 | 1.000000 | 0.765286 | 0.456246 |
| Healthy life expectancy | 0.289263 | 0.863062 | 0.765286 | 1.000000 | 0.427892 |
| Freedom to make life choices | 0.078196 | 0.394603 | 0.456246 | 0.427892 | 1.000000 |
The larger the absolute value of a coefficient, the stronger the linear relationship between the two columns. The first row shows how each factor correlates with the maximum infection rate.
Our analysis is not finished until we visualize the results as figures and graphs, so that everyone can understand what we get out of it.
data.head()
| max_infection_rates | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | |
|---|---|---|---|---|---|
| Afghanistan | 232.0 | 0.350 | 0.517 | 0.361 | 0.000 |
| Albania | 34.0 | 0.947 | 0.848 | 0.874 | 0.383 |
| Algeria | 199.0 | 1.002 | 1.160 | 0.785 | 0.086 |
| Argentina | 291.0 | 1.092 | 1.432 | 0.881 | 0.471 |
| Armenia | 134.0 | 0.850 | 1.055 | 0.815 | 0.283 |
x = data["GDP per capita"]
y = data["max_infection_rates"]
sns.scatterplot(x=x,y=y)
<AxesSubplot:xlabel='GDP per capita', ylabel='max_infection_rates'>
x = data["GDP per capita"]
y = data["max_infection_rates"]
# Same scatter plot, with the infection rate transformed by the natural log
sns.scatterplot(x=x,y=np.log(y))
<AxesSubplot:xlabel='GDP per capita', ylabel='max_infection_rates'>
sns.regplot(x=x, y=np.log(y))
<AxesSubplot:xlabel='GDP per capita', ylabel='max_infection_rates'>
x = data["Social support"]
y = data["max_infection_rates"]
sns.scatterplot(x=x,y=np.log(y))
<AxesSubplot:xlabel='Social support', ylabel='max_infection_rates'>
sns.regplot(x=x, y=np.log(y))
<AxesSubplot:xlabel='Social support', ylabel='max_infection_rates'>
x = data["Healthy life expectancy"]
y = data["max_infection_rates"]
sns.scatterplot(x=x,y=np.log(y))
<AxesSubplot:xlabel='Healthy life expectancy', ylabel='max_infection_rates'>
sns.regplot(x=x, y=np.log(y))
<AxesSubplot:xlabel='Healthy life expectancy', ylabel='max_infection_rates'>
x = data['Freedom to make life choices']
y = data['max_infection_rates']
sns.scatterplot(x=x,y=np.log(y))
<AxesSubplot:xlabel='Freedom to make life choices', ylabel='max_infection_rates'>
sns.regplot(x=x,y=np.log(y))
<AxesSubplot:xlabel='Freedom to make life choices', ylabel='max_infection_rates'>
These visualizations are based on data as of May 25 and November 8, 2020. I have used the daily report data published by Johns Hopkins University, starting with the report for May 25, 2020. The next part of the code loads the .csv data into our project.
# Load the daily report for 2020-05-25, then inspect its schema and first rows
df = pd.read_csv("Datasets/05-25-2020.csv")
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3410 entries, 0 to 3409 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FIPS 3006 non-null float64 1 Admin2 3009 non-null object 2 Province_State 3232 non-null object 3 Country_Region 3410 non-null object 4 Last_Update 3410 non-null object 5 Lat 3341 non-null float64 6 Long_ 3341 non-null float64 7 Confirmed 3410 non-null int64 8 Deaths 3410 non-null int64 9 Recovered 3410 non-null int64 10 Active 3410 non-null int64 11 Combined_Key 3410 non-null object dtypes: float64(3), int64(4), object(5) memory usage: 319.8+ KB
| FIPS | Admin2 | Province_State | Country_Region | Last_Update | Lat | Long_ | Confirmed | Deaths | Recovered | Active | Combined_Key | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45001.0 | Abbeville | South Carolina | US | 2020-05-26 02:32:35 | 34.223334 | -82.461707 | 35 | 0 | 0 | 35 | Abbeville, South Carolina, US |
| 1 | 22001.0 | Acadia | Louisiana | US | 2020-05-26 02:32:35 | 30.295065 | -92.414197 | 299 | 18 | 0 | 281 | Acadia, Louisiana, US |
| 2 | 51001.0 | Accomack | Virginia | US | 2020-05-26 02:32:35 | 37.767072 | -75.632346 | 731 | 11 | 0 | 720 | Accomack, Virginia, US |
| 3 | 16001.0 | Ada | Idaho | US | 2020-05-26 02:32:35 | 43.452658 | -116.241552 | 796 | 23 | 0 | 773 | Ada, Idaho, US |
| 4 | 19001.0 | Adair | Iowa | US | 2020-05-26 02:32:35 | 41.330756 | -94.471059 | 8 | 0 | 0 | 8 | Adair, Iowa, US |
Preprocessing the data
Now that our data has loaded successfully, the next step is to preprocess it before plotting. This includes dropping the columns we do not need and renaming the country column:
# Remove the identifier/timestamp columns and shorten the country column name.
df.drop(
    columns=['FIPS', 'Admin2', 'Last_Update', 'Province_State', 'Combined_Key'],
    inplace=True,
)
df.rename(columns={'Country_Region': "Country"}, inplace=True)
df.head()
| Country | Lat | Long_ | Confirmed | Deaths | Recovered | Active | |
|---|---|---|---|---|---|---|---|
| 0 | US | 34.223334 | -82.461707 | 35 | 0 | 0 | 35 |
| 1 | US | 30.295065 | -92.414197 | 299 | 18 | 0 | 281 |
| 2 | US | 37.767072 | -75.632346 | 731 | 11 | 0 | 720 |
| 3 | US | 43.452658 | -116.241552 | 796 | 23 | 0 | 773 |
| 4 | US | 41.330756 | -94.471059 | 8 | 0 | 0 | 8 |
The data can be grouped together by the ‘groupby’ function of the dataframe. It is similar to the GROUPBY statement in SQL.
### group the data by country
# The columns are selected with a list: tuple-style multi-key indexing on a
# GroupBy is deprecated and raises an error in pandas 2.x.
world = df.groupby("Country")[['Confirmed', 'Active', 'Recovered', 'Deaths']].sum().reset_index()
world.head()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. This is separate from the ipykernel package so we can avoid doing imports until
| Country | Confirmed | Active | Recovered | Deaths | |
|---|---|---|---|---|---|
| 0 | Afghanistan | 11173 | 9857 | 1097 | 219 |
| 1 | Albania | 1004 | 177 | 795 | 32 |
| 2 | Algeria | 8503 | 3147 | 4747 | 609 |
| 3 | Andorra | 763 | 49 | 663 | 51 |
| 4 | Angola | 70 | 48 | 18 | 4 |
Plotting the top 20 countries with the maximum number of confirmed cases
### Find top 20 countries with maximum number of confirmed cases
top_20 = world.sort_values(by=['Confirmed'], ascending=False).head(20)
### Generate a Barplot
### x and y are passed by keyword: positional data vectors are deprecated in
### seaborn and no longer accepted from version 0.12 on.
plt.figure(figsize=(12,10))
plot = sns.barplot(x=top_20['Confirmed'], y=top_20['Country'])
### Write each bar's value at the end of the bar (i is the bar's row position)
for i, value in enumerate(top_20['Confirmed']):
    plot.text(value, i-0.05, f'{value:,.0f}', size=10)
plt.show()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
Plotting Confirmed and Recovered cases for the top 10 countries with the maximum number of confirmed cases
### Top 10 countries by confirmed cases
top_10 = world.sort_values(by=['Confirmed'], ascending=False).head(10)
### Generate a Barplot: the recovered bars (green) are drawn on the same axes,
### over the confirmed bars (red).
### x and y are passed by keyword: positional data vectors are deprecated in
### seaborn and no longer accepted from version 0.12 on.
plt.figure(figsize=(15,10))
confirmed = sns.barplot(x=top_10['Confirmed'], y=top_10['Country'], color = 'red', label='Confirmed')
recovered = sns.barplot(x=top_10['Recovered'], y=top_10['Country'], color = 'green', label='Recovered')
### Adding Texts for barplots
for i, value in enumerate(top_10['Confirmed']):
    confirmed.text(value, i-0.05, f'{value:,.0f}', size=9)
for i, value in enumerate(top_10['Recovered']):
    recovered.text(value, i-0.05, f'{value:,.0f}', size=9)
plt.legend(loc=4)
plt.show()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
Plotting a Choropleth map on World Map
A choropleth map is a type of thematic map in which areas are shaded or patterned in proportion to a statistical variable that represents an aggregate summary of a geographic characteristic within each area, such as population density or per-capita income.
Choropleth maps provide an easy way to visualize how a measurement varies across a geographic area or show the level of variability within a region
# World map shaded by the number of confirmed cases per country.
figure = px.choropleth(
    world,
    locations='Country',
    locationmode='country names',
    color='Confirmed',
    hover_name='Country',
    color_continuous_scale='tealgrn',
    range_color=[1, 1000000],
    title='Countries with Confirmed cases',
)
figure.show()
# Load the daily report for 2020-11-08 (df is rebound, replacing the May data),
# then inspect its schema and first rows
df = pd.read_csv("Datasets/11-08-2020.csv")
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3960 entries, 0 to 3959 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FIPS 3262 non-null float64 1 Admin2 3267 non-null object 2 Province_State 3790 non-null object 3 Country_Region 3960 non-null object 4 Last_Update 3960 non-null object 5 Lat 3878 non-null float64 6 Long_ 3878 non-null float64 7 Confirmed 3960 non-null int64 8 Deaths 3960 non-null int64 9 Recovered 3960 non-null int64 10 Active 3958 non-null float64 11 Combined_Key 3960 non-null object 12 Incidence_Rate 3878 non-null float64 13 Case-Fatality_Ratio 3917 non-null float64 dtypes: float64(6), int64(3), object(5) memory usage: 433.2+ KB
| FIPS | Admin2 | Province_State | Country_Region | Last_Update | Lat | Long_ | Confirmed | Deaths | Recovered | Active | Combined_Key | Incidence_Rate | Case-Fatality_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | Afghanistan | 2020-11-09 05:25:45 | 33.93911 | 67.709953 | 42092 | 1558 | 34458 | 6076.0 | Afghanistan | 108.126879 | 3.701416 |
| 1 | NaN | NaN | NaN | Albania | 2020-11-09 05:25:45 | 41.15330 | 20.168300 | 24206 | 559 | 12092 | 11555.0 | Albania | 841.128640 | 2.309345 |
| 2 | NaN | NaN | NaN | Algeria | 2020-11-09 05:25:45 | 28.03390 | 1.659600 | 62051 | 2048 | 42037 | 17966.0 | Algeria | 141.504046 | 3.300511 |
| 3 | NaN | NaN | NaN | Andorra | 2020-11-09 05:25:45 | 42.50630 | 1.521800 | 5383 | 75 | 4248 | 1060.0 | Andorra | 6966.931987 | 1.393275 |
| 4 | NaN | NaN | NaN | Angola | 2020-11-09 05:25:45 | -11.20270 | 17.873900 | 12433 | 307 | 5899 | 6227.0 | Angola | 37.829059 | 2.469235 |
# Remove the identifier/timestamp columns and shorten the country column name.
df.drop(
    columns=['FIPS', 'Admin2', 'Last_Update', 'Province_State', 'Combined_Key'],
    inplace=True,
)
df.rename(columns={'Country_Region': "Country"}, inplace=True)
df.head()
| Country | Lat | Long_ | Confirmed | Deaths | Recovered | Active | Incidence_Rate | Case-Fatality_Ratio | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 33.93911 | 67.709953 | 42092 | 1558 | 34458 | 6076.0 | 108.126879 | 3.701416 |
| 1 | Albania | 41.15330 | 20.168300 | 24206 | 559 | 12092 | 11555.0 | 841.128640 | 2.309345 |
| 2 | Algeria | 28.03390 | 1.659600 | 62051 | 2048 | 42037 | 17966.0 | 141.504046 | 3.300511 |
| 3 | Andorra | 42.50630 | 1.521800 | 5383 | 75 | 4248 | 1060.0 | 6966.931987 | 1.393275 |
| 4 | Angola | -11.20270 | 17.873900 | 12433 | 307 | 5899 | 6227.0 | 37.829059 | 2.469235 |
### group the data by country
# The columns are selected with a list: tuple-style multi-key indexing on a
# GroupBy is deprecated and raises an error in pandas 2.x.
world = df.groupby("Country")[['Confirmed', 'Active', 'Recovered', 'Deaths']].sum().reset_index()
world.head()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
| Country | Confirmed | Active | Recovered | Deaths | |
|---|---|---|---|---|---|
| 0 | Afghanistan | 42092 | 6076.0 | 34458 | 1558 |
| 1 | Albania | 24206 | 11555.0 | 12092 | 559 |
| 2 | Algeria | 62051 | 17966.0 | 42037 | 2048 |
| 3 | Andorra | 5383 | 1060.0 | 4248 | 75 |
| 4 | Angola | 12433 | 6227.0 | 5899 | 307 |
### Find top 20 countries with maximum number of confirmed cases
top_20 = world.sort_values(by=['Confirmed'], ascending=False).head(20)
### Generate a Barplot
### x and y are passed by keyword: positional data vectors are deprecated in
### seaborn and no longer accepted from version 0.12 on.
plt.figure(figsize=(12,10))
plot = sns.barplot(x=top_20['Confirmed'], y=top_20['Country'])
### Write each bar's value at the end of the bar (i is the bar's row position)
for i, value in enumerate(top_20['Confirmed']):
    plot.text(value, i-0.05, f'{value:,.0f}', size=10)
plt.show()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
### Top 10 countries by confirmed cases
top_10 = world.sort_values(by=['Confirmed'], ascending=False).head(10)
### Generate a Barplot: the recovered bars (green) are drawn on the same axes,
### over the confirmed bars (red).
### x and y are passed by keyword: positional data vectors are deprecated in
### seaborn and no longer accepted from version 0.12 on.
plt.figure(figsize=(15,10))
confirmed = sns.barplot(x=top_10['Confirmed'], y=top_10['Country'], color = 'red', label='Confirmed')
recovered = sns.barplot(x=top_10['Recovered'], y=top_10['Country'], color = 'green', label='Recovered')
### Adding Texts for barplots
for i, value in enumerate(top_10['Confirmed']):
    confirmed.text(value, i-0.05, f'{value:,.0f}', size=9)
for i, value in enumerate(top_10['Recovered']):
    recovered.text(value, i-0.05, f'{value:,.0f}', size=9)
plt.legend(loc=4)
plt.show()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. /Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
# One world map per case type, all drawn with the same fixed colour range.
for column, title in (
    ('Confirmed', 'Countries with Confirmed cases'),
    ('Deaths', 'Countries with Death cases'),
    ('Recovered', 'Countries with Recovered cases'),
):
    figure = px.choropleth(
        world,
        locations='Country',
        locationmode='country names',
        color=column,
        hover_name='Country',
        color_continuous_scale='tealgrn',
        range_color=[1, 1000000],
        title=title,
    )
    figure.show()
# Load the per-date, per-country table and convert the Date column to datetimes
full_grouped = pd.read_csv('Datasets/full_grouped.csv')
full_grouped['Date'] = pd.to_datetime(full_grouped['Date'])
full_grouped.head()
| Date | Country/Region | Confirmed | Deaths | Recovered | Active | New cases | New deaths | New recovered | WHO Region | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-22 | Afghanistan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Eastern Mediterranean |
| 1 | 2020-01-22 | Albania | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Europe |
| 2 | 2020-01-22 | Algeria | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Africa |
| 3 | 2020-01-22 | Andorra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Europe |
| 4 | 2020-01-22 | Angola | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Africa |
full_grouped.shape
(35156, 10)
# Over the time
# log(0) is undefined: zero counts are masked to NaN before taking the log, so
# NumPy raises no divide-by-zero RuntimeWarning and no -inf reaches the colour
# axis.  Positive counts keep exactly the same log value as before.
confirmed_counts = full_grouped["Confirmed"]
log_confirmed = np.log(confirmed_counts.where(confirmed_counts > 0))
fig = px.choropleth(full_grouped, locations="Country/Region",
                    color=log_confirmed,
                    locationmode='country names', hover_name="Country/Region",
                    # one animation frame per calendar day
                    animation_frame=full_grouped["Date"].dt.strftime('%Y-%m-%d'),
                    title='Cases over time', color_continuous_scale=px.colors.sequential.matter)
fig.update(layout_coloraxis_showscale=False)
fig.show()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/pandas/core/series.py:726: RuntimeWarning: divide by zero encountered in log
# color palette (confirmed, deaths, recovered, active)
cnf, dth, rec, act = '#393e46', '#ff2e63', '#21bf73', '#fe9801'
# Totals per date for each case type.  The columns are selected with a list:
# tuple-style multi-key GroupBy indexing is deprecated and raises in pandas 2.x.
case_types = ['Recovered', 'Deaths', 'Active']
temp = full_grouped.groupby('Date')[case_types].sum().reset_index()
# Long format: one row per (date, case type), which px.area colours by 'Case'.
temp = temp.melt(id_vars="Date", value_vars=case_types, var_name='Case', value_name='Count')
fig = px.area(temp, x="Date", y="Count", color='Case', height=600, width=700,
              title='Cases over time', color_discrete_sequence=[rec, dth, act])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
def plot_stacked(col):
    """Show a bar chart of `col` over time, coloured by country.

    `col` names a column of the module-level `full_grouped` frame and is also
    used as the chart title.
    """
    bar_options = dict(
        x="Date",
        y=col,
        color='Country/Region',
        height=600,
        title=col,
        color_discrete_sequence=px.colors.cyclical.mygbm,
    )
    chart = px.bar(full_grouped, **bar_options)
    chart.update_layout(showlegend=True)
    chart.show()
def plot_line(col):
    """Show a line chart of `col` over time, one line per country.

    `col` names a column of the module-level `full_grouped` frame and is also
    used as the chart title.
    """
    line_options = dict(
        x="Date",
        y=col,
        color='Country/Region',
        height=600,
        title=col,
        color_discrete_sequence=px.colors.cyclical.mygbm,
    )
    chart = px.line(full_grouped, **line_options)
    chart.update_layout(showlegend=True)
    chart.show()
# Draw every metric first as bars, then as per-country lines.
metrics = ('Confirmed', 'Deaths', 'New cases', 'Active')
for metric in metrics:
    plot_stacked(metric)
for metric in metrics:
    plot_line(metric)
"""
def gt_n(n):
countries = full_grouped[full_grouped['Confirmed']>n]['Country/Region'].unique()
temp = full_table[full_table['Country/Region'].isin(countries)]
temp = temp.groupby(['Country/Region', 'Date'])['Confirmed'].sum().reset_index()
temp = temp[temp['Confirmed']>n]
# print(temp.head())
min_date = temp.groupby('Country/Region')['Date'].min().reset_index()
min_date.columns = ['Country/Region', 'Min Date']
# print(min_date.head())
from_nth_case = pd.merge(temp, min_date, on='Country/Region')
from_nth_case['Date'] = pd.to_datetime(from_nth_case['Date'])
from_nth_case['Min Date'] = pd.to_datetime(from_nth_case['Min Date'])
from_nth_case['N days'] = (from_nth_case['Date'] - from_nth_case['MinDate']).dt.days
# print(from_nth_case.head())
fig = px.line(from_nth_case, x='N days', y='Confirmed', color='Country/Region',
title='N days from '+str(n)+' case', height=600)
fig.show()
"""
"\ndef gt_n(n):\n countries = full_grouped[full_grouped['Confirmed']>n]['Country/Region'].unique()\n temp = full_table[full_table['Country/Region'].isin(countries)]\n temp = temp.groupby(['Country/Region', 'Date'])['Confirmed'].sum().reset_index()\n temp = temp[temp['Confirmed']>n]\n # print(temp.head())\n min_date = temp.groupby('Country/Region')['Date'].min().reset_index()\n min_date.columns = ['Country/Region', 'Min Date']\n # print(min_date.head())\n from_nth_case = pd.merge(temp, min_date, on='Country/Region')\n from_nth_case['Date'] = pd.to_datetime(from_nth_case['Date'])\n from_nth_case['Min Date'] = pd.to_datetime(from_nth_case['Min Date'])\n from_nth_case['N days'] = (from_nth_case['Date'] - from_nth_case['MinDate']).dt.days\n # print(from_nth_case.head())\n fig = px.line(from_nth_case, x='N days', y='Confirmed', color='Country/Region',\n title='N days from '+str(n)+' case', height=600)\n fig.show()\n"
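# Disabled: gt_n only exists inside the string literal above, so it is never
# defined.  Before re-enabling it, note that the draft reads `full_table`,
# which is not defined in the visible part of this notebook, and the column
# 'MinDate', although the column is created as 'Min Date'.
#gt_n(100000)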
def plot_bubble(col, pal):
    """Show a bubble chart of `col` per country and date.

    Only rows of `full_grouped` with a positive value of `col` are drawn;
    bubbles are sized and coloured by `col`, using `pal` as the continuous
    colour scale.
    """
    positive_rows = full_grouped[full_grouped[col] > 0]
    ordered = positive_rows.sort_values('Country/Region', ascending=False)
    chart = px.scatter(
        ordered,
        x='Date',
        y='Country/Region',
        size=col,
        color=col,
        height=3000,
        color_continuous_scale=pal,
    )
    # One y-axis tick per unit step; the colour bar is hidden.
    chart.update_layout(yaxis=dict(dtick=1))
    chart.update(layout_coloraxis_showscale=False)
    chart.show()
# Bubble charts for daily new cases and for active cases.
for bubble_metric in ('New cases', 'Active'):
    plot_bubble(bubble_metric, 'Viridis')
# Work on an explicit copy: adding a column to a slice of full_grouped is what
# triggers pandas' SettingWithCopyWarning.
temp = full_grouped[['Date', 'Country/Region', 'New cases']].copy()
# 1 where the number of new cases for that country and date is non-zero, else 0
temp['New cases reported ?'] = (temp['New cases'] != 0).astype(int)
# temp.head()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import plotly.graph_objs as go

# Country-by-date heatmap of the 0/1 "New cases reported ?" flag.
reported = temp['New cases reported ?']
heatmap = go.Heatmap(
    z=reported,
    x=temp['Date'],
    y=temp['Country/Region'],
    colorscale='Emrld',
    showlegend=False,
    text=reported,
)
fig = go.Figure(data=heatmap)
fig.update_layout(yaxis=dict(dtick=1), height=3000)
fig.show()
# Week of the year as a zero-padded string ('%U' counts weeks from Sunday).
full_grouped['Week No.'] = full_grouped['Date'].dt.strftime('%U')
# The columns are selected with a list: tuple-style multi-key GroupBy indexing
# is deprecated and raises an error in pandas 2.x.
# NOTE(review): Confirmed/Deaths/Recovered/Active look like running totals
# (the table also has separate 'New ...' columns), so summing them over the
# days of a week counts the same cases repeatedly - confirm this is intended.
weekly_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active',
                  'New cases', 'New deaths', 'New recovered']
week_wise = full_grouped.groupby('Week No.')[weekly_columns].sum().reset_index()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
def plot_weekwise(col, hue):
    """Show a bar chart of `col` per week number, drawn in the colour `hue`.

    Reads the module-level `week_wise` frame; `col` is also the chart title.
    """
    chart = px.bar(
        week_wise,
        x="Week No.",
        y=col,
        width=700,
        color_discrete_sequence=[hue],
    )
    # Title only; both axis captions are blanked.
    layout_options = dict(title=col, xaxis_title="", yaxis_title="")
    chart.update_layout(**layout_options)
    chart.show()
# Weekly charts, each metric in its own colour.
for weekly_metric, colour in (
    ('Confirmed', '#000000'),
    ('Deaths', dth),
    ('New cases', '#cd6684'),
):
    plot_weekwise(weekly_metric, colour)
# Calendar month number, read through the same .dt accessor used for 'Week No.'.
full_grouped['Month'] = full_grouped['Date'].dt.month
# The columns are selected with a list: tuple-style multi-key GroupBy indexing
# is deprecated and raises an error in pandas 2.x.
# NOTE(review): Confirmed/Deaths/Recovered/Active look like running totals
# (the table also has separate 'New ...' columns), so summing them over the
# days of a month counts the same cases repeatedly - confirm this is intended.
monthly_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active',
                   'New cases', 'New deaths', 'New recovered']
month_wise = full_grouped.groupby('Month')[monthly_columns].sum().reset_index()
/Users/eamankwah/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
def plot_monthwise(col, hue):
    """Show a bar chart of `col` per month, drawn in the colour `hue`.

    Reads the module-level `month_wise` frame; `col` is also the chart title.
    """
    chart = px.bar(
        month_wise,
        x="Month",
        y=col,
        width=700,
        color_discrete_sequence=[hue],
    )
    # Title only; both axis captions are blanked.
    layout_options = dict(title=col, xaxis_title="", yaxis_title="")
    chart.update_layout(**layout_options)
    chart.show()
# Monthly charts, each metric in its own colour.
for monthly_metric, colour in (
    ('Confirmed', '#000000'),
    ('Deaths', dth),
    ('New cases', '#cd6684'),
):
    plot_monthwise(monthly_metric, colour)